Source Code Analysis

Initialization

Load Data

In [1]:
import base64
import pandas as pd
from bson.objectid import ObjectId
from singleton_decorator import singleton
from pymongo import MongoClient, WriteConcern
from sklearn.utils import random


seed=77

class Cryptor:
    @staticmethod
    def encode(target):
        b = base64.b64encode(f'sscd={target}'.encode("utf-8"))
        return str(b, "utf-8")

    @staticmethod
    def decode(target):
        return str(base64.b64decode(target), "utf-8")[5:]


@singleton
class MongoEngine:
    def __init__(self):
        ip = 'localhost'
        port = 27017
        user_name = 'sscd'
        password = 'c3NjZD1jM05qWkQxemMyTmtjSGRr'
        db_name = 'webpage_snapshot_repository'
        doc_name = 'webpage'

        password = Cryptor.decode(password)
        password = Cryptor.decode(password)

        self._mongo_client = MongoClient(f"mongodb://{user_name}:{password}@{ip}:{port}/")
        wc_majority = WriteConcern("majority", wtimeout=1000)
        self._webpage = self._mongo_client.get_database(db_name, write_concern=wc_majority)[doc_name]

    def find(self, *query):
        cursor = self._webpage.find(*query)
        result = None
        try:
            result = pd.DataFrame(list(cursor))
        finally:
            cursor.close()
        return result

    def close(self):
        self._mongo_client.close()

    def __exit__(self):
        self.close()

def get_sample_urls(portion, seed):
    engine = MongoEngine()
    ids = engine.find({"scraped": True}, {"_id": 1})

    rnd_idx = random.sample_without_replacement(
        n_population=len(ids),
        n_samples=int(len(ids) * portion),
        random_state=seed)
    ids = ids.iloc[rnd_idx,:] \
        .applymap(ObjectId) \
        .values \
        .squeeze() \
        .tolist()
    result = engine.find({"_id":{ "$in": ids}})

    return result


df = get_sample_urls(.005, seed)

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7376 entries, 0 to 7375
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   _id                7376 non-null   object
 1   url                7376 non-null   object
 2   scheme             7376 non-null   object
 3   netloc             7376 non-null   object
 4   path               7376 non-null   object
 5   params             7376 non-null   object
 6   status             7376 non-null   object
 7   scraped            7376 non-null   bool  
 8   html_text          5846 non-null   object
 9   snapshot_img_path  5846 non-null   object
 10  timestamp          7366 non-null   object
 11  comment            694 non-null    object
dtypes: bool(1), object(11)
memory usage: 641.2+ KB

Train Test Split

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit

X_train, X_test = train_test_split(df, test_size=0.3, random_state=seed)

def extract_dv(df):
    return df.status == 'SUCCESS'

display(extract_dv(X_train).head())
display(extract_dv(X_test).head())
15      False
1851     True
7136     True
3228     True
1836     True
Name: status, dtype: bool
1541    True
1324    True
5927    True
1737    True
1484    True
Name: status, dtype: bool

Feature Engineering - First Round

In [3]:
df.head()
Out[3]:
_id url scheme netloc path params status scraped html_text snapshot_img_path timestamp comment
0 5e87e296e27b5dafe7a08d30 http://www.fao.org/docrep/017/aq236e/aq236e.pdf http www.fao.org /docrep/017/aq236e/aq236e.pdf SUCCESS True <html><head></head><body></body></html> /home/jjian03/Desktop/workspace/website_qualit... 20200403223815 NaN
1 5e87e297e27b5dafe7a08e2b http://www.cancer.org/research/cancerfactsstat... http www.cancer.org /research/cancerfactsstatistics/cancerfactsfig... SUCCESS True <html xmlns="http://www.w3.org/1999/xhtml" lan... /home/jjian03/Desktop/workspace/website_qualit... 20200403223831 NaN
2 5e87e297e27b5dafe7a08fae http://www2.ed.gov/PDFDocs/college-completion/... http www2.ed.gov /PDFDocs/college-completion/12-community-colle... SUCCESS True <html lang="fr-FR" class="js"><head>\n\t<meta ... /home/jjian03/Desktop/workspace/website_qualit... 20200403223844 NaN
3 5e87e297e27b5dafe7a09002 http://www.moh.gov.ae/en/ http www.moh.gov.ae /en/ SUCCESS True <html dir="ltr" lang="en-US" xmlns:mso="urn:sc... /home/jjian03/Desktop/workspace/website_qualit... 20200403223907 NaN
4 5e87e297e27b5dafe7a09180 http://www.fda.gov/downloads/Drugs/Development... http www.fda.gov /downloads/Drugs/DevelopmentApprovalProcess/De... SUCCESS True <html xmlns="http://www.w3.org/1999/xhtml"><he... /home/jjian03/Desktop/workspace/website_qualit... 20200403225216 NaN
In [4]:
def print_uniqueValue(df):
    df_unique = pd.DataFrame()
    for col_name in df.columns:
        df_unique[col_name] = [len(df[col_name].unique())]

    df_unique['total'] = [len(df)]
    df_unique.index = ['unique count']
    display(df_unique.T)

print_uniqueValue(df)
unique count
_id 7376
url 7376
scheme 3
netloc 3175
path 6764
params 6
status 3
scraped 1
html_text 4259
snapshot_img_path 5847
timestamp 7098
comment 2
total 7376

Features in URL

Length of the url hierarchy

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
pd.options.mode.chained_assignment = None


class URLLengthCounter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        result.loc[:,'url_length'] = result['url'].apply(self._get_length)
        return result

    def _get_length(self, url):
        return len(url)


pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
])

result = pipe.transform(X_train)

display(result[['url', 'url_length']].head(5))
url url_length
15 http://arxiv:1312.7624/ 23
1851 http://www.fnt.nl/media/docs/Stuurgroep/LSKAau... 55
7136 http://orcid.org/0000-0003-0829-7569 36
3228 https://doi.org/10.1097/00063198-199807000-00002 48
1836 http://dx.doi.org/10.1002/cncr.21334 36

Depth of the url hierarchy

In [6]:
class URLDepthCounter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        result.loc[:,'url_depth'] = result['path'].apply(self._get_depth)
        return result

    def _get_depth(self, path):
        last_idx = path.rindex('/')
        if last_idx + 1 < len(path):
            last_idx = len(path)
        return path[:last_idx].count('/')

pipe = Pipeline([
    ('url_depth_counter', URLDepthCounter()),
])

result = pipe.transform(result)

display(result[['path', 'url_depth']].head(5))
path url_depth
15 / 0
1851 /media/docs/Stuurgroep/LSKAaug2014.pdf 4
7136 /0000-0003-0829-7569 1
3228 /10.1097/00063198-199807000-00002 2
1836 /10.1002/cncr.21334 2

Has WWW subdomain

In [7]:
class HasWWWConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        result.loc[:, 'has_www'] = result['netloc'].apply(self._has_www)
        return result

    def _has_www(self, domain):
        return int(domain.startswith('www.'))


pipe = Pipeline([
    ('has_www_converter', HasWWWConverter()),
])

result = pipe.transform(result)

display(result[['netloc', 'has_www']].head(5))
netloc has_www
15 arxiv:1312.7624 0
1851 www.fnt.nl 1
7136 orcid.org 0
3228 doi.org 0
1836 dx.doi.org 0

Level of the Subdomain

In [8]:
class SubdomainLevelCounter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        result.loc[:, 'subdomain_level'] = result['netloc'].apply(self._get_level)
        return result

    def _get_level(self, domain):
        return domain.count('.')


pipe = Pipeline([
    ('subdomain_level_counter', SubdomainLevelCounter()),
])

result = pipe.transform(result)

display(result[['netloc', 'subdomain_level']].head(5))
netloc subdomain_level
15 arxiv:1312.7624 1
1851 www.fnt.nl 2
7136 orcid.org 1
3228 doi.org 1
1836 dx.doi.org 2

Number of HTTP-Get parameters

In [9]:
import numpy as np


class RequestParameterCounter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        result['params'] = result['params'].replace(np.nan, '', regex=True)
        result.loc[:, 'param_cnt'] = result['params'].apply(self._count_param)
        return result

    def _count_param(self, params):
        if params is '':
            return 0
        return params.count('&') + 1

pipe = Pipeline([
    ('request_parameter_counter', RequestParameterCounter()),
])

result = pipe.transform(result)

display(result[['params', 'param_cnt']].head(5))
params param_cnt
15 0
1851 0
7136 0
3228 0
1836 0

Domain Suffix

In [10]:
!pip install feature_engine
Requirement already satisfied: feature_engine in /home/jjian03/anaconda3/lib/python3.7/site-packages (0.4.3)
Requirement already satisfied: numpy<1.19.0,>=1.18.2 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from feature_engine) (1.18.4)
Requirement already satisfied: scikit-learn<0.23.0,>=0.22.2 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from feature_engine) (0.22.2.post1)
Requirement already satisfied: pandas<1.1.0,>=1.0.3 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from feature_engine) (1.0.3)
Requirement already satisfied: statsmodels<0.12.0,>=0.11.1 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from feature_engine) (0.11.1)
Requirement already satisfied: scipy<1.5.0,>=1.4.1 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from feature_engine) (1.4.1)
Requirement already satisfied: joblib>=0.11 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from scikit-learn<0.23.0,>=0.22.2->feature_engine) (0.13.2)
Requirement already satisfied: python-dateutil>=2.6.1 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from pandas<1.1.0,>=1.0.3->feature_engine) (2.8.0)
Requirement already satisfied: pytz>=2017.2 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from pandas<1.1.0,>=1.0.3->feature_engine) (2019.1)
Requirement already satisfied: patsy>=0.5 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from statsmodels<0.12.0,>=0.11.1->feature_engine) (0.5.1)
Requirement already satisfied: six>=1.5 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas<1.1.0,>=1.0.3->feature_engine) (1.12.0)
In [11]:
from feature_engine import categorical_encoders


class DomainSuffixBuilder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._suffix_mapping = None

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        # Remove incorrect urls
        result = result[result['netloc'].apply(lambda x: '.' in x)]
        result.loc[:, 'suffix'] = result.netloc.apply(DomainSuffixBuilder._get_url_suffix)
        result.loc[:, 'is_port_access'] = result.suffix.apply(DomainSuffixBuilder._is_port_access)
        result.loc[:, 'suffix_idx'] = result.suffix.apply(DomainSuffixBuilder._clean_url_suffix)
        encoder = categorical_encoders.CountFrequencyCategoricalEncoder(
            encoding_method='frequency',
            variables=['suffix'])
        result = encoder.fit_transform(result)
        self._suffix_dict = encoder.encoder_dict_['suffix']
        return result

    @property
    def suffix_dict(self):
        return self._suffix_dict

    @staticmethod
    def _get_url_suffix(url):
        last_idx = url.rindex('.')
        return url[last_idx + 1:]

    @staticmethod
    def _clean_url_suffix(url):
        return url.split(':')[0]

    @staticmethod
    def _is_port_access(suffix):
        return int(len([token for token in suffix.split(':') if token.strip() != ''])>1)


pipe = Pipeline([
    ('domain_suffix_builder', DomainSuffixBuilder()),
])

result = pipe.transform(result)


display(result[['netloc', 'is_port_access', 'suffix', 'suffix_idx']].head(5))
pipe.steps[-1][1].suffix_dict
netloc is_port_access suffix suffix_idx
15 arxiv:1312.7624 0 0.000194 7624
1851 www.fnt.nl 0 0.006797 nl
7136 orcid.org 0 0.491552 org
3228 doi.org 0 0.491552 org
1836 dx.doi.org 0 0.491552 org
Out[11]:
{'org': 0.49155175762283937,
 'com': 0.14410565158283162,
 'gov': 0.09710623422023694,
 'uk': 0.03845406875121383,
 'edu': 0.03223926976111866,
 'int': 0.02000388424936881,
 'ca': 0.01611963488055933,
 'net': 0.014565935133035541,
 'au': 0.013789085259273645,
 'de': 0.012235385511749854,
 'eu': 0.010875898232666537,
 'jp': 0.01048747329578559,
 'nl': 0.006797436395416585,
 'cn': 0.006214798990095164,
 'fr': 0.0056321615847737426,
 'br': 0.005243736647892795,
 'dk': 0.004466886774130899,
 'se': 0.0038842493688094775,
 'in': 0.0036900369003690036,
 'es': 0.003301611963488056,
 'ch': 0.0029131870266071083,
 'nz': 0.0025247620897261604,
 'io': 0.0025247620897261604,
 'kr': 0.0023305496212856864,
 'za': 0.0021363371528452125,
 'it': 0.0021363371528452125,
 'at': 0.0019421246844047388,
 'be': 0.0019421246844047388,
 'ir': 0.0019421246844047388,
 'mx': 0.0019421246844047388,
 'tw': 0.0017479122159642648,
 'info': 0.001553699747523791,
 'no': 0.0013594872790833172,
 'hk': 0.0013594872790833172,
 'fi': 0.0013594872790833172,
 'il': 0.0011652748106428432,
 'pl': 0.0011652748106428432,
 'us': 0.0009710623422023694,
 'sg': 0.0009710623422023694,
 'gl': 0.0009710623422023694,
 'pk': 0.0009710623422023694,
 'my': 0.0007768498737618955,
 'cl': 0.0007768498737618955,
 'ru': 0.0007768498737618955,
 'ar': 0.0005826374053214216,
 'lk': 0.0005826374053214216,
 '': 0.0005826374053214216,
 'tr': 0.0005826374053214216,
 'ie': 0.0005826374053214216,
 'sa': 0.0005826374053214216,
 'ly': 0.0005826374053214216,
 'gr': 0.0005826374053214216,
 'np': 0.0003884249368809478,
 'th': 0.0003884249368809478,
 'cat': 0.0003884249368809478,
 'id': 0.0003884249368809478,
 'fj': 0.0003884249368809478,
 'ee': 0.0003884249368809478,
 'hu': 0.0003884249368809478,
 'lt': 0.0003884249368809478,
 'cu': 0.0003884249368809478,
 'md': 0.0003884249368809478,
 'uy': 0.0003884249368809478,
 'ng': 0.0003884249368809478,
 'nu': 0.0003884249368809478,
 'ke': 0.0003884249368809478,
 'pt': 0.0003884249368809478,
 'uk_interproscan': 0.0001942124684404739,
 'cr': 0.0001942124684404739,
 '141': 0.0001942124684404739,
 'jp:8090': 0.0001942124684404739,
 'tz': 0.0001942124684404739,
 'pdf': 0.0001942124684404739,
 'who-umc': 0.0001942124684404739,
 'ac': 0.0001942124684404739,
 'hr': 0.0001942124684404739,
 'biz': 0.0001942124684404739,
 'lb': 0.0001942124684404739,
 'pe': 0.0001942124684404739,
 'global': 0.0001942124684404739,
 'vn': 0.0001942124684404739,
 '7624': 0.0001942124684404739,
 'mk': 0.0001942124684404739,
 'rw': 0.0001942124684404739,
 'cy': 0.0001942124684404739,
 'de:8443': 0.0001942124684404739,
 'uni-mki': 0.0001942124684404739,
 'ro': 0.0001942124684404739,
 'coop': 0.0001942124684404739,
 'nicotine': 0.0001942124684404739,
 'sk': 0.0001942124684404739,
 'et': 0.0001942124684404739,
 'cz': 0.0001942124684404739,
 'esil': 0.0001942124684404739,
 'mil': 0.0001942124684404739,
 'ec': 0.0001942124684404739,
 'hm': 0.0001942124684404739,
 'ece': 0.0001942124684404739,
 'kw': 0.0001942124684404739,
 've': 0.0001942124684404739,
 'ae': 0.0001942124684404739,
 'asterix': 0.0001942124684404739,
 'si': 0.0001942124684404739,
 'sn': 0.0001942124684404739,
 'rs': 0.0001942124684404739,
 'ht': 0.0001942124684404739,
 'edy': 0.0001942124684404739,
 'nih': 0.0001942124684404739,
 '21': 0.0001942124684404739,
 'qa': 0.0001942124684404739,
 'nl:8080': 0.0001942124684404739,
 'gh': 0.0001942124684404739,
 'mp': 0.0001942124684404739,
 'bh': 0.0001942124684404739,
 'wmo': 0.0001942124684404739,
 'co': 0.0001942124684404739,
 'sl': 0.0001942124684404739,
 'doi': 0.0001942124684404739,
 'mz': 0.0001942124684404739,
 'ws': 0.0001942124684404739,
 'gt': 0.0001942124684404739}

Remove the Incorrect Domains

In [12]:
import re


class IncorrectDomainUrlCleaner(BaseEstimator, TransformerMixin):
    def __init__(self):
        # TLD ranges from 2 to 63
        self._regex = re.compile(r'^[a-zA-Z]{2,63}$', re.I)

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        result.loc[:, 'is_correct'] = result.suffix_idx.apply(self._is_correct)
        result = result[result.is_correct]
        result = result.drop('is_correct', axis=1)
        return result

    def _is_correct(self, domain_suffix):
        return True if self._regex.match(domain_suffix) else False


pipe = Pipeline([
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
])

result = pipe.transform(result)

print(f'Before changes: {len(X_train)}')
print(f'After changes: {len(result)}')
Before changes: 5163
After changes: 5140

Protocol Type Conversion

In [13]:
from feature_engine import categorical_encoders


class ColumnRenamer(BaseEstimator, TransformerMixin):
    def __init__(self, mapping):
        self._mapping = mapping

    @property
    def mapping(self):
        return self._mapping

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        self._mapping = {key: value for key, value in self._mapping.items() if key in result.columns}
        result = result.rename(columns=self._mapping)
        return result

pipe = Pipeline([
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
])

result = pipe.transform(result)

display(result[['url', 'protocol_type']].head(5))
url protocol_type
1851 http://www.fnt.nl/media/docs/Stuurgroep/LSKAau... http
7136 http://orcid.org/0000-0003-0829-7569 http
3228 https://doi.org/10.1097/00063198-199807000-00002 https
1836 http://dx.doi.org/10.1002/cncr.21334 http
5775 http://journal.frontiersin.org/article/10.3389... http

EDA

In [14]:
print_uniqueValue(result)
result.info()
unique count
_id 5140
url 5140
protocol_type 3
netloc 2333
path 4719
params 5
status 3
scraped 1
html_text 2991
snapshot_img_path 4098
timestamp 5000
comment 2
url_length 165
url_depth 13
has_www 2
subdomain_level 5
param_cnt 2
suffix 34
is_port_access 2
suffix_idx 111
total 5140
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5140 entries, 1851 to 6871
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   _id                5140 non-null   object 
 1   url                5140 non-null   object 
 2   protocol_type      5140 non-null   object 
 3   netloc             5140 non-null   object 
 4   path               5140 non-null   object 
 5   params             5140 non-null   object 
 6   status             5140 non-null   object 
 7   scraped            5140 non-null   bool   
 8   html_text          4097 non-null   object 
 9   snapshot_img_path  4097 non-null   object 
 10  timestamp          5136 non-null   object 
 11  comment            473 non-null    object 
 12  url_length         5140 non-null   int64  
 13  url_depth          5140 non-null   int64  
 14  has_www            5140 non-null   int64  
 15  subdomain_level    5140 non-null   int64  
 16  param_cnt          5140 non-null   int64  
 17  suffix             5140 non-null   float64
 18  is_port_access     5140 non-null   int64  
 19  suffix_idx         5140 non-null   object 
dtypes: bool(1), float64(1), int64(6), object(12)
memory usage: 808.1+ KB
In [15]:
!pip install plotly
Requirement already satisfied: plotly in /home/jjian03/anaconda3/lib/python3.7/site-packages (4.7.1)
Requirement already satisfied: six in /home/jjian03/anaconda3/lib/python3.7/site-packages (from plotly) (1.12.0)
Requirement already satisfied: retrying>=1.3.3 in /home/jjian03/anaconda3/lib/python3.7/site-packages (from plotly) (1.3.3)
In [16]:
import plotly

import plotly.graph_objects as go
from plotly.subplots import make_subplots


# pipe = Pipeline([
#     ('url_length_counter', URLLengthCounter()),
#     ('url_depth_counter', URLDepthCounter()),
#     ('has_www_converter', HasWWWConverter()),
#     ('subdomain_level_counter', SubdomainLevelCounter()),
#     ('request_parameter_counter', RequestParameterCounter()),
#     ('domain_suffix_builder', DomainSuffixBuilder()),
#     ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
#     ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
# ])

# result = pipe.transform(X_train)

non_binary_result = result[['protocol_type', 'url_length', 'url_depth', 'subdomain_level', 'param_cnt', 'suffix_idx']]

def plot_distribution(data, title, height=1200, width=800):
    fig = make_subplots(rows=len(data.columns), cols=1,
                    subplot_titles=data.columns)

    for idx, col_name in enumerate(data.columns):
        fig.add_trace(go.Histogram(x=data[col_name], name=col_name), row=idx + 1, col=1)


    fig.update_layout(height=height, width=width, title_text=title)
    return fig

plot_distribution(non_binary_result, "Non Binary Features Distribution")
In [17]:
binary_result = result[['status', 'has_www', 'is_port_access']]

plot_distribution(binary_result, "Binary Features Distribution")

Most of the non-binary feature are right skewed, it is necessary to apply the standard scaler at the later process.

Modeling

Data Cleaning

Age of the URL

In [18]:
import math
import time
import datetime
from dateutil import relativedelta


class TimeseriesConverter(BaseEstimator, TransformerMixin):
    def __init__(self):
#         self._scraped_dt = datetime.datetime.strptime('20200513132015', "%Y%m%d%H%M%S")
        self._scraped_dt = datetime.datetime.now()

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        result.loc[:, 'timestamp_coef'] = result.timestamp.apply(self._convert_timestamp_to_coef)
        return result

    def _convert_timestamp_to_coef(self, ts):
        if None is ts or np.nan is ts:
            return 0
        ts_str = str(ts).strip()
        if '' == ts_str:
            return 0
        ts_obj = datetime.datetime.strptime(ts_str, "%Y%m%d%H%M%S")
        return 1/math.exp(relativedelta.relativedelta(self._scraped_dt, ts_obj).months)

pipe = Pipeline([
    ('timeseries_converter', TimeseriesConverter()),
])

result = pipe.transform(result)

result.timestamp_coef.head()
Out[18]:
1851    1.0
7136    1.0
3228    1.0
1836    1.0
5775    1.0
Name: timestamp_coef, dtype: float64

Remove redundant features

In [19]:
class FeatureRemover(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self._removed_features = None
        self._features = features

    @property
    def removed_features(self):
        return self._removed_features

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        self._removed_features = [col_name for col_name in self._features if col_name in result.columns]
        result = result.drop(self._removed_features, axis=1)
        return result


class FeaturePicker(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self._picked_features = None
        self._features = features

    @property
    def picked_features(self):
        return self._picked_features

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x
        self._picked_features = [col_name for col_name in self._features if col_name in result.columns]
        result = result[self._picked_features]
        return result


pipe = Pipeline([
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status',
                                       ])),
])

result = pipe.transform(result)

result.columns
Out[19]:
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'param_cnt',
       'suffix', 'timestamp_coef', 'is_port_access', 'status'],
      dtype='object')

Miscellaneous Clean Up

  • Standardize variance
  • Convert Categorical Feature into Frequency Based Numberical Index
  • Remove low variance features
In [20]:
from itertools import compress
from sklearn import feature_selection


class LowVarianceRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold):
        self._p = threshold
        self._bi_vt = feature_selection.VarianceThreshold(threshold=threshold*(1-threshold))
        self._regular_vt = feature_selection.VarianceThreshold(threshold=threshold)
        self._dropped_columns = list()

    @property
    def threshold(self):
        return self._threshold

    @property
    def dropped_columns(self):
        return self._dropped_columns

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        result = x

        df_unique = pd.DataFrame()
        for col_name in result.columns:
            if 'status' != col_name:
                df_unique[col_name] = [len(result[col_name].unique())]

        df_unique.index = ['unique count']
        df_unique = df_unique.T.squeeze()

        bi_columns = df_unique[df_unique == 2].index.tolist()
        regular_columns = df_unique[df_unique != 2].index.tolist()

        if len(bi_columns) >0:
            self._bi_vt.fit(result[bi_columns])
            bi_mask = self._bi_vt.variances_ < self._p * (1 - self._p)            
            self._dropped_columns = self._dropped_columns + list(compress(bi_columns, bi_mask))
        if len(regular_columns) >0 :
            self._regular_vt.fit(result[regular_columns])
            regular_mask = self._regular_vt.variances_ < self._p
            self._dropped_columns = self._dropped_columns + list(compress(regular_columns, regular_mask))

        if len(self._dropped_columns) > 0:
            remover = FeatureRemover(self._dropped_columns)
            result = remover.transform(result)
        return result


pipe = Pipeline([
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    ('low_variance_remover', LowVarianceRemover(0.01))

])


result = pipe.fit_transform(result)


print(f'Before transform: {X_train.columns}\n')
print(f'After transform: {result.columns}\n')
print(f'Dropped columns: {pipe.steps[-1][1].dropped_columns}')
Before transform: Index(['_id', 'url', 'scheme', 'netloc', 'path', 'params', 'status', 'scraped',
       'html_text', 'snapshot_img_path', 'timestamp', 'comment', 'url_length',
       'url_depth', 'has_www', 'subdomain_level', 'param_cnt'],
      dtype='object')

After transform: Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef', 'status'],
      dtype='object')

Dropped columns: ['param_cnt', 'is_port_access']

The port indicator is wiped out, but I believe this could be a reason to explain the availability of the url resource, so I will separately build a subset to analyze that part later.

Add Sklearn Build-in Function

In [21]:
from sklearn import preprocessing


class CustomizedStandardizer(BaseEstimator, TransformerMixin):
    def __init__(self, norm='l2'):
        self._pipe = Pipeline([
            ('normalizer', preprocessing.Normalizer(norm=norm, copy=True)),
            ('standard_scaler', preprocessing.StandardScaler()),

        ])
        self._columns = None

    @property
    def columns(self):
        return self._columns

    def fit(self,x,y=None):
        return self

    def transform(self,x,y=None):
        self._columns = x.drop('status', axis=1).columns
        self._columns = [*self._columns, 'status']
        result = self._pipe.fit_transform(x.drop('status', axis=1))
        dv = x.status.apply(lambda v: 1 if 'SUCCESS' == v else 0).tolist()
        dv = np.array([dv]).T
        result = np.append(result, dv, axis=1)
        return result

pipe = Pipeline([
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

result = pipe.fit_transform(result)

result = pd.DataFrame(result, columns= pipe.steps[-1][1].columns)

plot_distribution(result, "Standardized Features Distribution")

Logistic Regression

In [22]:
import gc
import multiprocessing

import warnings
warnings.filterwarnings("ignore")


cpu_cnt = multiprocessing.cpu_count()
allocated_cpu = cpu_cnt
print(f"Allocated {allocated_cpu} CPUs")
gc.collect()
Allocated 8 CPUs
Out[22]:
259
In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import GaussianNB


class AnalysisEngineBuilder:

    def __init_(self):
        self._X_train = None
        self._y_train = None
        self._X_test = None
        self._y_test = None
        self._param_grid = None
        self._engine = None
    def set_X_train(self, X_train):
        self._X_train = X_train
        return self
    def set_y_train(self, y_train):
        self._y_train = y_train
        return self
    def set_X_test(self, X_test):
        self._X_test = X_test
        return self
    def set_y_test(self, y_test):
        self._y_test = y_test
        return self
    def set_param_grid(self, param_grid):
        self._param_grid = param_grid
        return self
    def set_engine(self, engine):
        self._engine = engine
        return self
    def build(self):
        return AnalysisEngineBuilder._AnalysisEngine(self._X_train, self._y_train, self._X_test, self._y_test, self._param_grid, self._engine)

    class _AnalysisEngine:
        def __init__(self, X_train, y_train, X_test, y_test, param_grid, engine):
            self._X_train = X_train
            self._y_train = y_train
            self._X_test = X_test
            self._y_test = y_test
            self._param_grid = param_grid
            self._engine = engine
            self._grid = GridSearchCV(self._engine, self._param_grid, cv=10, scoring='accuracy')
            self._pred = None
            self._pred_prob = None
            self._accuracy = None
            self._roc = None
            self._tpr = None
            self._fpr = None
        @property
        def grid_search_result(self):
            return pd.DataFrame(self._grid.cv_results_)
        @property
        def accuracy(self):
            return self._accuracy
        @property
        def roc(self):
            return self._roc
        @property
        def tpr(self):
            return self._tpr
        @property
        def fpr(self):
            return self._fpr
        @property
        def threshold(self):
            return self._threshold
        def analyze(self):
            self._grid.fit(self._X_train, self._y_train)
            self._pred = self._grid.predict(self._X_test)
            self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, self._pred)
            try:
                self._pred_prob = self._grid.predict_proba(self._X_test)
                self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, pd.DataFrame(self._pred_prob)[1])
            except AttributeError as ae:
                pass
            self._accuracy = accuracy_score(self._y_test, self._pred)
            self._roc = roc_auc_score(self._y_test, self._pred)

            return self._grid

        def show_performance(self):
            print(f"ROC/AUC: {round(self._roc*100, 2)}%")
            print()
            print(classification_report(self._y_test, self._pred, target_names=["Valid Url","Invalid"]))
In [24]:
import matplotlib


class Visualizer():
    @staticmethod
    def group_plot_roc_curve(title, data_group):
        plt.clf()
        plt.figure(figsize=(5, 5), dpi=80)

        x = [0.0, 1.0]
        plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='Naive prediction (Random guess)')
        for idx, group in enumerate(data_group):
            fpr = group[0]
            tpr = group[1]
            label = group[2]
            linestyle= 'solid'
            if idx % 2 == 1:
                linestyle= 'dashed'
            plt.plot(fpr, tpr, linestyle=linestyle, linewidth=10, label=label)

        plt.xlim(0.0, 1.0)
        plt.ylim(0.0, 1.0)
        plt.xlabel("FPR", fontsize=14)
        plt.ylabel("TPR", fontsize=14)

        plt.legend(fontsize=10, loc='lower right')

        plt.title(title, fontsize=14)
        plt.tight_layout()

        return plt

    @staticmethod
    def plot_performance(data,
                            legend_type_name,
                            x_axis_name,
                            upper_y_label,
                            lower_y_label,
                            title):
        plt.clf()
        f, ax = plt.subplots(2, 1, figsize=(15,8))
        legends = data[legend_type_name].unique()
        for idx, legend in enumerate(legends):
            _data = data[data[legend_type_name]==legend]
            ax[0].plot(_data[x_axis_name], _data[upper_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
            ax[0].set_xlabel(x_axis_name, fontsize=15)
            ax[0].set_ylabel(upper_y_label.upper(), fontsize=15)
            ax[0].legend(fontsize=10, loc='upper right')

            ax[1].plot(_data[x_axis_name], _data[lower_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
            ax[1].set_xlabel(x_axis_name, fontsize=15)
            ax[1].set_ylabel(lower_y_label.upper(), fontsize=15)
            ax[1].legend(fontsize=10, loc='lower right')

        ax[0].set_title(f"Performance Evaluation of {title}", fontsize=24)
        plt.tight_layout()

        return plt

    @staticmethod
    def plot_feature_importance(reg_coef, col_names, title):
        reg_coef = pd.Series(reg_coef, index=col_names)
        reg_coef = reg_coef.sort_values()
        matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
        reg_coef.plot(kind="barh",)
        plt.title(title, fontsize=15)

        return plt

    @staticmethod
    def plot_importance_trending(X_train, feature_importance_matrix, title, offset=3):
        feature_importance = feature_importance_matrix.groupby('C').agg(['mean'])[[*X_train.columns]]
        feature_importance.columns = X_train.columns.tolist()
        feature_importance['C'] = feature_importance.index
        
        column_names = X_train.columns
        lbds = feature_importance['C'].tolist()
        coef_matrix = feature_importance[X_train.columns]
        x_lab = 'Lambda'
        y_lab = 'Weight'
        plt.clf()
        plt.figure(figsize=(15, 10))
        for idx, col_name in enumerate(column_names):
            plt.plot(lbds, coef_matrix.iloc[:,idx], 'o-', linewidth=2, label=col_name)
            c = coef_matrix.iloc[0,idx]
            plt.annotate(col_name, (lbds[offset], coef_matrix.iloc[offset,idx]))

        plt.title(title, fontSize=25)
        plt.xlabel(x_lab)
        plt.ylabel(y_lab)

        plt.legend(loc='upper right')
        plt.tight_layout()

        return plt
In [25]:
from concurrent.futures.thread import ThreadPoolExecutor
import warnings
warnings.filterwarnings("ignore")
from concurrent.futures.thread import ThreadPoolExecutor
from sklearn.metrics import hinge_loss


def loss_accuracy_analyze_job_builder(X_train, y_train, X_test, y_test, model_func, param):
    def _analyze_param_combination():
        engine = AnalysisEngineBuilder() \
                    .set_X_train(X_train) \
                    .set_y_train(y_train) \
                    .set_X_test(X_test) \
                    .set_y_test(y_test) \
                    .set_param_grid(param) \
                    .set_engine(model_func) \
                    .build()
        model = engine.analyze()
        
        # Performance scores
        loss = hinge_loss(y_test, pd.DataFrame(model.predict_proba(X_test))[1])
        auc = roc_auc_score(y_test, model.predict(X_test))
        
        coef = pd.Series(model.best_estimator_.coef_[0], index=X_test.columns).to_dict()
        _param = param
        for key, value in param.items():
            _param[key] = value[0]
        return {
            'accuracy': engine.accuracy * 100,
            'loss': loss,
            'auc': auc,
            **coef,
            **_param
        }
    return _analyze_param_combination

# Refactor into the analyzer later on
def calculate_grid_performance(X_train, y_train, X_test, y_test, params, model):
    # build combination list
    combination_list = pd.DataFrame({'dummy': [1]})
    for key, values in params.items():
        combination_list = pd.merge(combination_list, pd.DataFrame({key: values, 'dummy': [1] * len(values)}))
    combination_list.drop('dummy',axis=1, inplace=True)

    # Train and extract scores
    futures = list()
    results = list()
    # Execute models in threads
    with ThreadPoolExecutor(max_workers=allocated_cpu) as executor:
        for combination in combination_list.to_dict('records'):
            combination = {key:[value] for key, value in combination.items()}
            future_model = executor.submit(loss_accuracy_analyze_job_builder(X_train, y_train, X_test, y_test, model, combination))
            futures.append(future_model)
        return pd.DataFrame.from_dict([future.result() for future in futures])
In [26]:
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status'
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    # Low Variance Filter works incorrectly.
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

X = pipe.fit_transform(df)

X_train, X_test = train_test_split(X, test_size=0.3, random_state=seed)

y_train = X_train[:,-1]
X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('status', axis=1)
print(X_train.columns)



y_test = X_test[:,-1]
X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('status', axis=1)
print(X_test.columns)
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef'],
      dtype='object')
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef'],
      dtype='object')
In [27]:
from sklearn.linear_model import LogisticRegression


start_time = time.time()
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted([*np.logspace(-3, -1, 5), *np.linspace(0.001, 0.5, 20)]),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
ROC/AUC: 51.32%

              precision    recall  f1-score   support

   Valid Url       1.00      0.03      0.05       455
     Invalid       0.80      1.00      0.89      1746

    accuracy                           0.80      2201
   macro avg       0.90      0.51      0.47      2201
weighted avg       0.84      0.80      0.71      2201

--- 06 minutes, 41.73 seconds ---
In [28]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])
Out[28]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [29]:
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted([*np.logspace(-3, -1, 5), *np.linspace(0.001, 0.5, 20)]),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)
In [30]:
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)
Out[30]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [68]:
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', offset=15)
Out[68]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>

Try encode the suffix index label with logarithm

In [32]:
class LogarithmTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self._columns = columns

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        result.loc[:, self._columns] = (result[self._columns]+0.00000000001).applymap(math.log)

        return result

pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status'
                                       ])),
    ('logarithm_transformer', LogarithmTransformer(['suffix'])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    # Low Variance Filter works incorrectly.
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

X = pipe.fit_transform(df)

X_train, X_test = train_test_split(X, test_size=0.3, random_state=seed)

y_train = X_train[:,-1]
X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('status', axis=1)
print(X_train.columns)



y_test = X_test[:,-1]
X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('status', axis=1)
print(X_test.columns)
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef'],
      dtype='object')
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef'],
      dtype='object')
In [40]:
start_time = time.time()
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted([*np.logspace(-3, -1, 5), *np.linspace(0.001, 0.5, 20)]),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
ROC/AUC: 51.32%

              precision    recall  f1-score   support

   Valid Url       1.00      0.03      0.05       455
     Invalid       0.80      1.00      0.89      1746

    accuracy                           0.80      2201
   macro avg       0.90      0.51      0.47      2201
weighted avg       0.84      0.80      0.71      2201

--- 00 minutes, 15.03 seconds ---
In [41]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])
Out[41]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [64]:
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted([*np.logspace(-3, -1, 5), *np.linspace(0.001, 0.5, 20)]),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)
In [65]:
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)
Out[65]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [69]:
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', offset=15)
Out[69]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>

Try Tong's methd

In [70]:
class DummySuffixDescritizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        dummies = pd.get_dummies(result.suffix_idx)
        dummies = FeaturePicker(['int', 'org', 'gov', 'in', 'eu', 'cn']).fit_transform(dummies)
        result = result.drop('suffix_idx', axis = 1).join(dummies, how='inner')

        return result

pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix_idx',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'status'
                                       ])),
    ('dummy_suffix_descritizer', DummySuffixDescritizer()),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type'])),
    # Low Variance Filter works incorrectly.
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

X = pipe.fit_transform(df)

X_train, X_test = train_test_split(X, test_size=0.3, random_state=seed)

y_train = X_train[:,-1]
X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('status', axis=1)
print(X_train.columns)



y_test = X_test[:,-1]
X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('status', axis=1)
print(X_test.columns)
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level',
       'timestamp_coef', 'int', 'org', 'gov', 'in', 'eu', 'cn'],
      dtype='object')
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level',
       'timestamp_coef', 'int', 'org', 'gov', 'in', 'eu', 'cn'],
      dtype='object')
In [71]:
start_time = time.time()
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted([*np.logspace(-3, -1, 5), *np.linspace(0.001, 0.5, 20)]),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
ROC/AUC: 51.43%

              precision    recall  f1-score   support

   Valid Url       1.00      0.03      0.06       455
     Invalid       0.80      1.00      0.89      1746

    accuracy                           0.80      2201
   macro avg       0.90      0.51      0.47      2201
weighted avg       0.84      0.80      0.72      2201

--- 01 minutes, 4.99 seconds ---
In [72]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])
Out[72]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [73]:
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted([*np.logspace(-3, -1, 5), *np.linspace(0.001, 0.5, 20)]),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)
In [74]:
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)
Out[74]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [75]:
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', offset=15)
Out[75]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>

Feature Engineering - Second Round

Features in source code

  • Restore the test data
In [76]:
X_train, X_test = train_test_split(df, test_size=0.3, random_state=seed)

display(extract_dv(X_train).head())
display(extract_dv(X_test).head())
15      False
1851     True
7136     True
3228     True
1836     True
Name: status, dtype: bool
1541    True
1324    True
5927    True
1737    True
1484    True
Name: status, dtype: bool
In [82]:
# First round pipeline

pipe_1st = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('timeseries_converter', TimeseriesConverter()),
])

result = pipe_1st.fit_transform(X_train)
print(type(result))
X_train.head()
<class 'pandas.core.frame.DataFrame'>
Out[82]:
_id url scheme netloc path params status scraped html_text snapshot_img_path timestamp comment url_length url_depth has_www subdomain_level param_cnt code_size
15 5e87e298e27b5dafe7a09a95 http://arxiv:1312.7624/ http arxiv:1312.7624 / UNAVAILABLE True NaN NaN 20200403230028 NaN 23 0 0 1 0 0
1851 5e87e339e27b5dafe7a61094 http://www.fnt.nl/media/docs/Stuurgroep/LSKAau... http www.fnt.nl /media/docs/Stuurgroep/LSKAaug2014.pdf SUCCESS True <html><head>\n <meta name="robots" cont... /home/jjian03/Desktop/workspace/website_qualit... 20200407040104 NaN 55 4 1 2 0 4477
7136 5e87e5d6e27b5dafe7bc9158 http://orcid.org/0000-0003-0829-7569 http orcid.org /0000-0003-0829-7569 SUCCESS True <html><head></head><body></body></html> /home/jjian03/Desktop/workspace/website_qualit... 20200517125614 NaN 36 1 0 1 0 39
3228 5e87e3bae27b5dafe7aa5f88 https://doi.org/10.1097/00063198-199807000-00002 https doi.org /10.1097/00063198-199807000-00002 SUCCESS True <html><head>\n <meta charset="utf-8">\n ... /home/jjian03/Desktop/workspace/website_qualit... 20200408142045 NaN 48 2 0 1 0 67745
1836 5e87e337e27b5dafe7a5ff37 http://dx.doi.org/10.1002/cncr.21334 http dx.doi.org /10.1002/cncr.21334 SUCCESS True <html lang="en" class="pb-page" data-request-i... /home/jjian03/Desktop/workspace/website_qualit... 20200407021105 NaN 36 2 0 2 0 306584
Code length(kb)
In [83]:
class SourceCodeByteCounter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        result['code_size'] = result.html_text \
            .replace(np.nan, '', regex=True) \
            .astype(str) \
            .apply(len)

        return result

pipe = Pipeline([
    ('source_code_byte_counter', SourceCodeByteCounter()),
])

print(type(result))

result = pipe.fit_transform(result)

result.code_size.head()
<class 'pandas.core.frame.DataFrame'>
Out[83]:
1851      4477
7136        39
3228     67745
1836    306584
5775        39
Name: code_size, dtype: int64
is HTML5
In [84]:
class HTML5Justifier(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self,x,y=None):
        return self
        
    def transform(self,x,y=None):
        result = x
        result['is_html5'] = result.html_text \
            .replace(np.nan, '', regex=True) \
            .apply(lambda x: x.split('\n', 1)[0].lower().strip() == '<!doctype html>' if x else False)

        return result

pipe = Pipeline([
    ('html5_justifier', HTML5Justifier()),
])

result = pipe.fit_transform(result)

result.is_html5.head()
Out[84]:
1851    False
7136    False
3228    False
1836    False
5775    False
Name: is_html5, dtype: bool

Text Mining

In [85]:
class BeautifulSoupParserBuilder:

    class _BeautifulSoupParser(BaseEstimator, TransformerMixin):
        def __init__(self,_lambda_pair):
            self._lambda_pair = _lambda_pair

        def fit(self,x,y=None):
            return self

        def transform(self,x,y=None):
            result = x
            for col_name, func in self._lambda_pair.items():
                result[col_name] = result.html_text \
                    .replace(np.nan, '', regex=True) \
                    .apply(lambda html_doc: BeautifulSoup(html_doc, 'html.parser')) \
                    .apply(func)

            return result

    def __init__(self):
        self._lambda_pair = dict()

    def add_lambda(self, column_name, lbd):
        self._lambda_pair[column_name] = lbd
        return self

    def build(self):
        return BeautifulSoupParserBuilder._BeautifulSoupParser(self._lambda_pair)
Title Length
In [86]:
from bs4 import BeautifulSoup


def get_title_length(soup):
    title = soup.title.string if soup.title else ''
    if not title:
        title = ''
    return len(title)
Types of the JS library
  • Extract this feature later when running the association rule.
No of JS files
In [87]:
def count_internal_js_lib(soup):
    sources=soup.findAll('script',{"src":True})
    return len([0 for source in sources if not source['src'].startswith('http')])

def count_external_js_lib(soup):
    sources=soup.findAll('script',{"src":True})
    return len([0 for source in sources if source['src'].startswith('http')])
Charset
In [88]:
def get_charset(soup):
    sources=soup.findAll('meta',{"charset":True})
    if 0 == len(sources):
        return ''
    return sources[0]['charset'].lower().replace('\'', '').replace('"', '')
iFrame in Body
In [89]:
def has_iframe(soup):
    sources=soup.findAll('iframe')
    return 0 == len(sources)
In [90]:
def count_hyperlink(soup):
    sources=soup.findAll('a')
    return len([1 for source in sources if source.has_attr('href') and source['href'].lower().startswith('http')])
Consolidate the pipeline
In [91]:
html_parser = BeautifulSoupParserBuilder() \
    .add_lambda('title_length', get_title_length) \
    .add_lambda('internal_js_cnt', count_internal_js_lib) \
    .add_lambda('external_js_cnt', count_external_js_lib) \
    .add_lambda('charset', get_charset) \
    .add_lambda('has_iframe', has_iframe) \
    .add_lambda('hyperlink_cnt', count_hyperlink) \
    .build()

pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('source_code_byte_counter', SourceCodeByteCounter()),
    ('html_parser', html_parser),
    ('html5_justifier', HTML5Justifier()),
    ('feature_picker', FeaturePicker(['protocol_type',
                                      'url_depth',
                                      'has_www',
                                      'subdomain_level',
                                      'param_cnt',
                                      'suffix',
                                      'timestamp_coef',
                                      'is_port_access',
                                      'status',
                                      'code_size',
                                      'title_length',
                                      'internal_js_cnt',
                                      'external_js_cnt',
                                      'charset',
                                      'is_html5',
                                      'has_iframe',
                                      'hyperlink_cnt',
                                       ])),
#     ('logarithm_transformer', LogarithmTransformer(['suffix'])),
#     ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
#         encoding_method='frequency',
#         variables=['protocol_type'])),
#     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
#     ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

result = pipe.fit_transform(X_train)

result[[
    'title_length',
    'internal_js_cnt',
    'external_js_cnt',
    'charset',
    'has_iframe',
    'hyperlink_cnt']].head()
Out[91]:
title_length internal_js_cnt external_js_cnt charset has_iframe hyperlink_cnt
1851 0 0 0 True 0
7136 0 0 0 True 0
3228 27 11 3 utf-8 False 14
1836 177 9 34 utf-8 False 74
5775 0 0 0 True 0
Remove tags, Tf-Idf Score of Body
Tf-Idf Score of Header

EDA - Second Round

In [92]:
# pipe = Pipeline([
#     ('url_length_counter', URLLengthCounter()),
#     ('url_depth_counter', URLDepthCounter()),
#     ('has_www_converter', HasWWWConverter()),
#     ('subdomain_level_counter', SubdomainLevelCounter()),
#     ('request_parameter_counter', RequestParameterCounter()),
#     ('domain_suffix_builder', DomainSuffixBuilder()),
#     ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
#     ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
#     ('source_code_byte_counter', SourceCodeByteCounter()),
#     ('html_parser', html_parser),
#     ('html5_justifier', HTML5Justifier()),
#     ('timeseries_converter', TimeseriesConverter()),
#     ('feature_picker', FeaturePicker(['protocol_type',
#                                       'url_depth',
#                                       'has_www',
#                                       'subdomain_level',
#                                       'param_cnt',
#                                       'suffix',
#                                       'timestamp_coef',
#                                       'is_port_access',
#                                       'status',
#                                       'code_size',
#                                       'title_length',
#                                       'internal_js_cnt',
#                                       'external_js_cnt',
#                                       'charset',
#                                       'is_html5',
#                                       'has_iframe',
#                                       'hyperlink_cnt',
#                                        ])),
# #     ('logarithm_transformer', LogarithmTransformer(['suffix'])),
# #     ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
# #         encoding_method='frequency',
# #         variables=['protocol_type'])),
# #     ('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
# #     ('standard_scaler', CustomizedStandardizer(norm='l2')),

# ])

# result = pipe.fit_transform(X_train)
result.loc[:,'charset'] = result.loc[:,'charset'].apply(lambda x: x.replace('\'', '').replace('"', ''))
result.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5140 entries, 1851 to 6871
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   protocol_type    5140 non-null   object 
 1   url_depth        5140 non-null   int64  
 2   has_www          5140 non-null   int64  
 3   subdomain_level  5140 non-null   int64  
 4   param_cnt        5140 non-null   int64  
 5   suffix           5140 non-null   float64
 6   is_port_access   5140 non-null   int64  
 7   status           5140 non-null   object 
 8   code_size        5140 non-null   int64  
 9   title_length     5140 non-null   int64  
 10  internal_js_cnt  5140 non-null   int64  
 11  external_js_cnt  5140 non-null   int64  
 12  charset          5140 non-null   object 
 13  is_html5         5140 non-null   bool   
 14  has_iframe       5140 non-null   bool   
 15  hyperlink_cnt    5140 non-null   int64  
dtypes: bool(2), float64(1), int64(10), object(3)
memory usage: 612.4+ KB
In [93]:
plot_distribution(result, "Features Distribution", height=2400)

No site uses HTML5

Revise the pipeline and generate the cleaned dataset

In [94]:
pipe = Pipeline([
    ('url_length_counter', URLLengthCounter()),
    ('url_depth_counter', URLDepthCounter()),
    ('has_www_converter', HasWWWConverter()),
    ('subdomain_level_counter', SubdomainLevelCounter()),
    ('request_parameter_counter', RequestParameterCounter()),
    ('domain_suffix_builder', DomainSuffixBuilder()),
    ('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
    ('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
    ('source_code_byte_counter', SourceCodeByteCounter()),
    ('html_parser', html_parser),
    ('timeseries_converter', TimeseriesConverter()),
    ('feature_picker', FeaturePicker([
                                        'protocol_type',
                                        'url_depth',
                                        'has_www',
                                        'subdomain_level',
                                        'param_cnt',
                                        'suffix',
                                        'timestamp_coef',
                                        'is_port_access',
                                        'code_size',
                                        'title_length',
                                        'internal_js_cnt',
                                        'external_js_cnt',
                                        'charset',
                                        'has_iframe',
                                        'hyperlink_cnt',
                                        'status',
                                       ])),
    ('feature_remover', FeatureRemover([
                                        'param_cnt',
                                        'is_port_access',
                                       ])),
    ('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
        encoding_method='frequency',
        variables=['protocol_type', 'charset'])),
    ('logarithm_transformer', LogarithmTransformer([
                                        'suffix',
                                        'title_length',
                                        'internal_js_cnt',
                                        'external_js_cnt',
                                        'hyperlink_cnt',
                                        'protocol_type',
                                        'charset'
    ])),
    ('standard_scaler', CustomizedStandardizer(norm='l2')),

])

X_train, X_test = train_test_split(df, test_size=0.1, random_state=seed)


X_train = pipe.fit_transform(X_train)
X_test = pipe.fit_transform(X_test)


X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
y_train = X_train.loc[:,'status'].astype(int)
X_train = X_train.drop('status', axis=1)
print(X_train.columns)

X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
y_test = X_test.loc[:,'status'].astype(int)
X_test = X_test.drop('status', axis=1)
print(X_test.columns)


df_train = X_train.copy()
df_train['label'] = y_train

df_test = X_test.copy()
df_test['label'] = y_test

df_train.to_csv('df_train.csv')
df_test.to_csv('df_test.csv')
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef', 'code_size', 'title_length', 'internal_js_cnt',
       'external_js_cnt', 'charset', 'has_iframe', 'hyperlink_cnt'],
      dtype='object')
Index(['protocol_type', 'url_depth', 'has_www', 'subdomain_level', 'suffix',
       'timestamp_coef', 'code_size', 'title_length', 'internal_js_cnt',
       'external_js_cnt', 'charset', 'has_iframe', 'hyperlink_cnt'],
      dtype='object')
In [101]:
start_time = time.time()
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-4, -3, 50)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
ROC/AUC: 98.73%

              precision    recall  f1-score   support

   Valid Url       1.00      0.97      0.99       158
     Invalid       0.99      1.00      1.00       574

    accuracy                           0.99       732
   macro avg       1.00      0.99      0.99       732
weighted avg       0.99      0.99      0.99       732

--- 02 minutes, 53.02 seconds ---
In [102]:
param_lr = {
    'l1_ratio': [0, *np.logspace(-3, 0, 5)],
    'C': sorted(np.logspace(-4, -3, 50)),
    'max_iter': np.arange(10,80,40),
}

lr = LogisticRegression(random_state=seed,
                        penalty='elasticnet',
                        solver='saga',
                        multi_class='ovr',
                        warm_start=False,
                        n_jobs=allocated_cpu,
)

loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)
In [103]:
Visualizer.plot_performance(data=loss_accuracy_matrix,
                    legend_type_name='l1_ratio',
                    x_axis_name='C',
                    upper_y_label='loss',
                    lower_y_label='auc',
                    title='Loss& Accuracy - Logistic Regression'
)
Out[103]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [104]:
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature', 40)
Out[104]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
  • Select the hyperparameter and train again
In [112]:
start_time = time.time()
param_lr = {
    'l1_ratio': [0],
    'C': [.0002],
    'max_iter': [80],
}

lr = LogisticRegression(random_state=seed,
                       penalty='elasticnet',
                       solver='saga',
                       multi_class='ovr',
                       warm_start=False,
                       n_jobs=allocated_cpu,
)


# Start to train model
engine_lr = AnalysisEngineBuilder() \
    .set_X_train(X_train) \
    .set_y_train(y_train) \
    .set_X_test(X_test) \
    .set_y_test(y_test) \
    .set_param_grid(param_lr) \
    .set_engine(lr) \
    .build()

model_lr = engine_lr.analyze()
engine_lr.show_performance()

t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
ROC/AUC: 83.86%

              precision    recall  f1-score   support

   Valid Url       1.00      0.68      0.81       158
     Invalid       0.92      1.00      0.96       574

    accuracy                           0.93       732
   macro avg       0.96      0.84      0.88       732
weighted avg       0.94      0.93      0.93       732

--- 00 minutes, 0.28 seconds ---
In [106]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

engine_lr.fpr

Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
    (engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])
Out[106]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
<Figure size 432x288 with 0 Axes>
In [107]:
Visualizer.plot_feature_importance(
    model_lr.best_estimator_.coef_[0], X_train.columns, 
    "Coefficients in the Logistic Regression")
Out[107]:
<module 'matplotlib.pyplot' from '/home/jjian03/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [ ]: